library(car)
library(mosaic)
library(DT)
library(tidyverse)
files <- dir()
rdat <- read.csv(grep(".csv", files, value=TRUE), header=TRUE)
as_tibble(rdat)
# A tibble: 230 x 6
y x1 x2 x3 x4 x5
<dbl> <int> <dbl> <dbl> <dbl> <int>
1 0.206 1 0.121 -7.62 1.41 1
2 0.173 0 0.288 6.66 1.24 0
3 -0.940 1 0.346 15.1 -3.41 0
4 -0.900 0 0.0425 14.0 -1.94 0
5 1.62 1 0.611 -1.38 4.12 1
6 0.253 0 0.309 3.66 5.53 1
7 -1.07 1 0.322 1.64 -3.67 0
8 -0.493 0 0.106 8.74 -2.86 0
9 0.802 1 0.206 8.61 4.76 0
10 0.538 0 0.227 6.40 5.39 1
# … with 220 more rows
pairs(rdat, pch=16, cex=1, panel=panel.smooth, col=rgb(.8,.8,.8))
lm1 <- lm(y ~ x4, data=rdat)
summary(lm1)
##
## Call:
## lm(formula = y ~ x4, data = rdat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.14252 -0.18749 0.02534 0.17983 0.75059
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.083896 0.020633 4.066 6.59e-05 ***
## x4 0.191594 0.006012 31.870 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3124 on 228 degrees of freedom
## Multiple R-squared: 0.8167, Adjusted R-squared: 0.8159
## F-statistic: 1016 on 1 and 228 DF, p-value: < 2.2e-16
palette(c("skyblue","orange","green","purple","steelblue","red","green3","black"))
pairs(cbind(R=lm1$res, Fit=lm1$fit, rdat), pch=16, cex=1, panel=panel.smooth, col=as.factor(rdat$x5))
lm1 <- lm(y ~ x4, data=rdat)
summary(lm1)
##
## Call:
## lm(formula = y ~ x4, data = rdat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.14252 -0.18749 0.02534 0.17983 0.75059
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.083896 0.020633 4.066 6.59e-05 ***
## x4 0.191594 0.006012 31.870 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3124 on 228 degrees of freedom
## Multiple R-squared: 0.8167, Adjusted R-squared: 0.8159
## F-statistic: 1016 on 1 and 228 DF, p-value: < 2.2e-16
palette(c("skyblue","orange","green","purple","steelblue","red","green3","black"))
pairs(cbind(R=lm1$res, Fit=lm1$fit, rdat), pch=16, cex=1, panel=panel.smooth, col=interaction(rdat$x1,rdat$x5))
lm2 <- lm(y ~ x4*x1*x5, data=rdat)
summary(lm2)
##
## Call:
## lm(formula = y ~ x4 * x1 * x5, data = rdat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.98128 -0.18428 0.01031 0.20109 0.67184
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.071795 0.038542 1.863 0.063818 .
## x4 0.213627 0.011092 19.259 < 2e-16 ***
## x1 -0.046025 0.054332 -0.847 0.397849
## x5 0.004851 0.056237 0.086 0.931343
## x4:x1 -0.013168 0.015555 -0.847 0.398171
## x4:x5 -0.062079 0.015917 -3.900 0.000127 ***
## x1:x5 0.134686 0.079941 1.685 0.093429 .
## x4:x1:x5 0.059754 0.023540 2.538 0.011820 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3009 on 222 degrees of freedom
## Multiple R-squared: 0.8343, Adjusted R-squared: 0.8291
## F-statistic: 159.7 on 7 and 222 DF, p-value: < 2.2e-16
lm3 <- lm(y ~ x4 + x4:x5, data=rdat)
summary(lm3)
##
## Call:
## lm(formula = y ~ x4 + x4:x5, data = rdat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0730 -0.1877 0.0247 0.1681 0.8251
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.08497 0.02034 4.178 4.2e-05 ***
## x4 0.20634 0.00795 25.955 < 2e-16 ***
## x4:x5 -0.03310 0.01190 -2.782 0.00586 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3078 on 227 degrees of freedom
## Multiple R-squared: 0.8227, Adjusted R-squared: 0.8212
## F-statistic: 526.7 on 2 and 227 DF, p-value: < 2.2e-16
palette(c("skyblue","orange","green","purple","steelblue","red","green3","black"))
pairs(cbind(R=lm3$res, Fit=lm3$fit, rdat), pch=16, cex=1, panel=panel.smooth, col=as.factor(rdat$x5))
I played with a few ideas until I got this…
lm4 <- lm(y ~ x4 + x4:x5 +
x1:x5 + x1:x4:x5, data=rdat)
summary(lm4)
##
## Call:
## lm(formula = y ~ x4 + x4:x5 + x1:x5 + x1:x4:x5, data = rdat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.96151 -0.18246 0.01235 0.18386 0.69161
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.056870 0.022576 2.519 0.01246 *
## x4 0.206687 0.007752 26.661 < 2e-16 ***
## x4:x5 -0.055140 0.013775 -4.003 8.49e-05 ***
## x5:x1 0.108436 0.047561 2.280 0.02355 *
## x4:x5:x1 0.046587 0.017622 2.644 0.00878 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3002 on 225 degrees of freedom
## Multiple R-squared: 0.8329, Adjusted R-squared: 0.83
## F-statistic: 280.5 on 4 and 225 DF, p-value: < 2.2e-16
rdat2 <- cbind(rdat, fit=lm4$fitted.values)
ggplot(rdat2, aes(x=x4, color=interaction(x1,x5))) +
geom_point(aes(y=y), pch=1) +
geom_point(aes(y=fit), pch=16, cex=0.5) +
geom_smooth(aes(y=y), method="lm") +
facet_wrap(~interaction(x1,x5))
## `geom_smooth()` using formula 'y ~ x'
final.lm <- lm(y ~ x4 + x4:x5 +
x1:x5 + x1:x4:x5, data=rdat)
summary(final.lm)
##
## Call:
## lm(formula = y ~ x4 + x4:x5 + x1:x5 + x1:x4:x5, data = rdat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.96151 -0.18246 0.01235 0.18386 0.69161
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.056870 0.022576 2.519 0.01246 *
## x4 0.206687 0.007752 26.661 < 2e-16 ***
## x4:x5 -0.055140 0.013775 -4.003 8.49e-05 ***
## x5:x1 0.108436 0.047561 2.280 0.02355 *
## x4:x5:x1 0.046587 0.017622 2.644 0.00878 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3002 on 225 degrees of freedom
## Multiple R-squared: 0.8329, Adjusted R-squared: 0.83
## F-statistic: 280.5 on 4 and 225 DF, p-value: < 2.2e-16
palette(c("skyblue","skyblue","green","purple","steelblue","red","green3","black"))
plot(y ~ x4, data=rdat, col=interaction(x1,x5))
points(final.lm$fit ~ x4, data=rdat, col=interaction(x1,x5), pch=16, cex=0.5)
b <- coef(final.lm)
x1=0; x5=0;
curve(b[1] + b[2]*x4 + b[3]*x4*x5 + b[4]*x5*x1 + b[5]*x4*x5*x1, add=TRUE, xname="x4", col=palette()[1])
x1=0; x5=1;
curve(b[1] + b[2]*x4 + b[3]*x4*x5 + b[4]*x5*x1 + b[5]*x4*x5*x1, add=TRUE, xname="x4", col=palette()[3])
x1=1; x5=1;
curve(b[1] + b[2]*x4 + b[3]*x4*x5 + b[4]*x5*x1 + b[5]*x4*x5*x1, add=TRUE, xname="x4", col=palette()[4])
\[ Y_i = \beta_0 + \beta_1 X_{4i} + \beta_2 X_{4i} X_{5i} + \beta_3 X_{5i}X_{1i} + \beta_4 X_{4i} X_{5i} X_{1i} + \epsilon_i \]